graphTweets is available on CRAN and github.
# from CRAN
install.packages("graphTweets")
# from github
devtools::install_github("JohnCoene/graphTweets")
# load
library(graphTweets)graphTweets only comes with three functions
package?graphTweetsSet yourself up with twitteR or streamR to fetch tweets (you may of course import data in some other way).
Data sets used in these slides:
Here I use the twitteR package to get 500 tweets on “#rstats”.
library(twitteR)
# replace with your details
setup_twitter_oauth(consumer_key, consumer_secret, access_token,
access_secret)
r_tw <- searchTwitter("rstats", n = 500)
# unlist tweets to data.frame
r_tw <- twListToDF(r_tw)download this dataset.
We can now get the list of edges using graphTweets.
# extract the edge list from the tweets: "text" holds the tweet body,
# "screenName" the author
edges <- graphTweets::getEdges(r_tw, "text", "screenName")
# remove duplicates: keep only the first occurrence of each edge row
edges <- unique(edges)
We may then use igraph to build and plot the network.
# build a directed graph from the edge list
g <- igraph::graph.data.frame(edges, directed = TRUE)
# plot with a Fruchterman-Reingold layout; the layout function is qualified
# with igraph:: like the other calls so the snippet works without
# library(igraph) having been called
igraph::plot.igraph(g, layout = igraph::layout.fruchterman.reingold(g))
You may also use the networkD3 package.
networkD3::simpleNetwork(edges, Source = "source", Target = "target")Alternatively one may use the igraph package to save the graph and import it in another software.
For instance you may want to save the graph as graphml.
write.graph(g, file = "graphTweets.graphml", format = "graphml")Then open it in Gephi.
Let’s use the same data set as in the previous example but this time add meta-data to the nodes.
# graphTweets: extract edges and nodes, passing "favoriteCount" through as
# an additional (...) meta-data column
edges <- getEdges(r_tw, source = "screenName", tweets = "text",
                  str.length = NULL, "favoriteCount")
nodes <- getNodes(edges, source = "source", target = "target",
                  "favoriteCount")
# replace NA with 0 in the meta-data column requested above (favoriteCount is
# the only one carried over, so that is the column to clean)
nodes$favoriteCount[is.na(nodes$favoriteCount)] <- 0
# build graph; columns of nodes become vertex attributes
g <- igraph::graph.data.frame(edges, directed = TRUE, vertices = nodes)
Now we can use the meta-data (favoriteCount) in the plot.
# scale vertices color to # of favorites
# brewer.pal is exported by RColorBrewer, so the public `::` accessor is used
pal <- RColorBrewer::brewer.pal(5, "Dark2")
# one interpolated colour per vertex
# NOTE(review): colours are assigned in vertex order, not ordered by
# favoriteCount -- confirm this is the intended mapping
V(g)$color <- colorRampPalette(pal)(length(V(g)$favoriteCount))
# plot vertex size ~ degree (log-scaled so hubs do not dwarf the rest)
plot(g, layout = layout.fruchterman.reingold(g), vertex.color = V(g)$color,
     vertex.size = log1p(degree(g)) * 3)
This time we’ll get tweets on the US Presidential 2016 (writing this on 2016-03-01)
# search terms 2016 presidential
st <- c("hillary", "clinton", "bernie", "sanders", "donald", "trump", "jeb",
        "bush", "ted", "cruz", "marco", "rubio", "carson", "christie")
# fetch up to 5000 tweets for every term in st, restricted by geocode
# (tweets from the US), and convert each result to a data.frame
tweets_by_term <- lapply(st, function(term) {
  tw <- searchTwitter(term, n = 5000,
                      geocode = "39.554883,-99.931641,1000mi")
  twListToDF(tw)
})
# bind all results once (rbind.fill accepts a list of data frames) instead of
# growing the data frame on every iteration
presi <- plyr::rbind.fill(tweets_by_term)
download this dataset.
We pass additional variables (...) to the getEdges function; tweets’ retweet count and their geo-coordinates so that we can later layout the network on a map rather than a canvas.
# remove NA
presi <- presi[!is.na(presi$longitude),]
edges <- getEdges(data = presi, tweets = "text", source = "screenName",
str.length = NULL, "longitude", "latitude", "retweetCount")We then pass the geo-coordinates and retweet counts as meta-data to our vertices using getNodes.
nodes <- getNodes(edges, source = names(edges)[1], target = names(edges)[2],
"longitude", "latitude", "retweetCount")Here I demonstrate with plotly as igraph cannot—on my machine—properly handle 3728 nodes.
# join nodes and edges and rename: attach the first three node columns
# (node name plus two coordinate columns) to each edge, once for the source
# and once for the target
edges <- dplyr::inner_join(edges[, 1:2], nodes[, 1:3],
                           by = c("source" = "nodes"))
edges <- dplyr::inner_join(edges, nodes[, 1:3], by = c("target" = "nodes"))
names(edges)[3:ncol(edges)] <- c("lon_start", "lat_start", "lon_end",
                                 "lat_end")
# remove NA
edges <- edges[!is.na(edges$lon_end), ]
# edges frequency to use for opacity in plot: count identical edges
edges$freq <- 1
edges <- plyr::ddply(edges, c("source", "target", "lon_start", "lat_start",
                              "lon_end", "lat_end"),
                     plyr::summarise, freq = sum(freq))
# add id for plotly; seq_len() stays empty when there are no edges
edges$id <- seq_len(nrow(edges))
# plot: nodes as markers sized by retweet count, edges as lines
# (plot_ly, add_trace, layout and %>% come from plotly, which must be attached)
plot_ly(nodes, lon = longitude, lat = latitude, type = 'scattergeo',
        locationmode = 'USA-states',
        marker = list(size = log1p(retweetCount * 3) * 5, color = 'red'),
        inherit = FALSE) %>%
  add_trace(lon = list(lon_start, lon_end), lat = list(lat_start, lat_end),
            group = id, opacity = 1, data = edges,
            mode = 'lines', line = list(width = freq, color = '#DD1C1A'),
            type = 'scattergeo', locationmode = 'USA-states') %>%
  layout(title = 'graphTweets',
         geo = list(scope = 'north america',
                    projection = list(type = 'azimuthal equal area')),
         showlegend = FALSE)
Detect communities in our graph. Somewhat useful though it is not a giant component like ego networks.
edges <- getEdges(data = r_tw, tweets = "text", source = "screenName",
str.length = NULL)
nodes <- getNodes(edges, source = "source", target = "target")
g <- igraph::graph.data.frame(edges, directed = TRUE, vertices = nodes)
# get communities
cm <- walktrap.community(g)
Plotting communities with igraph couldn’t be easier.
plot(g, vertex.color = membership(cm), vertex.size = log1p(degree(g)) * 3)Temporal graphs are do-able within R but remain clunky; one has to plot a graph frame by frame and turn it into a gif.
graphTweets lets you build a temporal graph in R and easily export it to/open it in Gephi; you only need tweets and dynamise.
dyn <- dynamise(data = presi, tweets = "text", source = "screenName",
start.stamp = "created")
dynamise returns an object of class graph; it can therefore be plotted using plot, but the plot will not be dynamic. To view the dynamic graph, the network must be opened in Gephi. Thankfully the function makes this extremely easy.
To view the graph in gephi write the file (see ?dynamise for more options), you may additionally open it in Gephi from R itself with open = TRUE
dyn <- dynamise(data = presi, tweets = "text", source = "screenName",
start.stamp = "created", write = TRUE, open = TRUE)To make the graph dynamic in Gephi a few more steps are required. The following slides essentially go through the steps detailed here on slides 25 and 26.
In the data laboratory for both the “nodes” and “edges” tables merge the columns “start.stamp” and “end.stamp”, select “create time interval” in the dropdown.
In the following window select the appropriate columns and check “parse dates”. If you followed the steps in the previous slides in creating the graph and used the created column as start.stamp in dynamise, use yyyy-MM-dd HH:mm:ss as the date format (this is essentially the format of the POSIXct/POSIXlt classes in R).
You should now be able to “enable timeline” at the bottom of the screen. Then again, if you followed the steps in the previous slides in creating the graph then in the bottom left corner hit the gear icon and hit “set time format”, change it to “Datetime”.
That’s it, mess around with the timeline.
By default dynamise keeps edges and nodes forever, which is unrealistic. Tweets do not last forever; use end.stamp to specify the delay.
Since we pass in an array of class POSIXct we can specify a delay in seconds like so:
# delay of an hour
dyn <- dynamise(data = presi, tweets = "text", source = "screenName",
start.stamp = "created", end.stamp = 3600, write = TRUE,
open = TRUE)The above will build a graph where tweets “last” one hour.
Sys.time()## [1] "2016-03-01 17:41:25 CST"
Sys.time() + 3600## [1] "2016-03-01 18:41:25 CST"
If we pass in a date we can specify a delay by day.
# derive a Date column so the delay below is counted in days, not seconds
presi$date <- as.Date(presi$created)
# delay of a day: start.stamp must point at the Date column created above
dyn <- dynamise(data = presi, tweets = "text", source = "screenName",
                start.stamp = "date", end.stamp = 1, write = TRUE,
                open = TRUE)
The above will build a graph where tweets “last” one day.
If you encounter issues, go to GitHub issues.
For the manual go on CRAN.
See my other packages and projects at http://johncoene.github.io/projects/